Bigcontest_EDA

0. Library Packages

library(dplyr)
options(dplyr.summarise.inform = FALSE) 
library(tidyr)
library(lubridate)
library(ggplot2)
library(lubridate)
library(plotly)

1. Data load

# Read the three CSV inputs into data frames.
loan_result <- read.csv(file = "../../../data/loan_result.csv")
log_data <- read.csv(file = "../../../data/log_data.csv")
user_spec <- read.csv(file = "../../../data/user_spec.csv")

데이터 살펴보기

# Preview the first six rows of user_spec.
user_spec %>% head()
##   application_id user_id birth_year gender         insert_time credit_score
## 1        1249046  118218       1985      1 2022-06-07 06:28:18          660
## 2         954900  553686       1968      1 2022-06-07 14:29:03          870
## 3         137274   59516       1997      1 2022-06-07 21:40:22          710
## 4        1570936  167320       1989      1 2022-06-07 09:40:27          820
## 5         967833   33400       2000      1 2022-06-07 08:55:07          630
## 6        1559350  746993       1994      1 2022-06-07 09:55:03          600
##   yearly_income     income_type company_enter_month employment_type
## 1      1.08e+08 PRIVATEBUSINESS            20151101            기타
## 2      3.00e+07 PRIVATEBUSINESS            20070201          정규직
## 3      3.00e+07      FREELANCER            20210901            기타
## 4      6.20e+07    EARNEDINCOME            20170101          정규직
## 5      3.60e+07    EARNEDINCOME            20210901          정규직
## 6      3.50e+07      FREELANCER            20160401            기타
##   houseown_type desired_amount  purpose personal_rehabilitation_yn
## 1          자가          1e+06     기타                          0
## 2  기타가족소유          3e+07 대환대출                          0
## 3  기타가족소유          1e+07   생활비                          0
## 4          자가          2e+06   생활비                          0
## 5  기타가족소유          5e+06   생활비                          0
## 6  기타가족소유          5e+06   생활비                          0
##   personal_rehabilitation_complete_yn existing_loan_cnt existing_loan_amt
## 1                                  NA                 4          1.62e+08
## 2                                  NA                 1          2.70e+07
## 3                                  NA                 5          1.50e+07
## 4                                  NA                 7          3.44e+08
## 5                                   0                 1          1.60e+07
## 6                                  NA                 1                NA

데이터 결측치 확인

# Number of missing values in each column of user_spec.
user_spec %>%
  is.na() %>%
  colSums()
##                      application_id                             user_id 
##                                   0                                   0 
##                          birth_year                              gender 
##                               12961                               12961 
##                         insert_time                        credit_score 
##                                   0                              105115 
##                       yearly_income                         income_type 
##                                  90                                   0 
##                 company_enter_month                     employment_type 
##                              171760                                   0 
##                       houseown_type                      desired_amount 
##                                   0                                  85 
##                             purpose          personal_rehabilitation_yn 
##                                   0                              587461 
## personal_rehabilitation_complete_yn                   existing_loan_cnt 
##                             1203354                              198556 
##                   existing_loan_amt 
##                              313774

2. User_spec 데이터 전처리

2.1 birth_year & gender 결측치 처리

# Collapse user_spec to one row per user_id, averaging that user's non-missing
# birth_year and gender values. A user whose values are all NA ends up with
# NaN, which is.na() still reports as missing in the checks that follow.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
user_info <- user_spec %>%
  group_by(user_id) %>%
  summarise(
    birth_year = mean(birth_year, na.rm = TRUE),
    gender = mean(gender, na.rm = TRUE)
  ) %>%
  ungroup()

# Count the user_ids whose aggregated birth_year / gender are still missing,
# i.e. users with no usable value in any of their rows.
user_info %>%
  is.na() %>%
  colSums()
##    user_id birth_year     gender 
##          0       6856       6856
# Drop the users whose birth_year could not be recovered (6856 users), then
# replace the raw birth_year / gender columns with the per-user values from
# user_info so the remaining NA entries are filled in.
na_list <- user_info %>%
  filter(is.na(birth_year)) %>%
  pull(user_id)
user_spec2 <- user_spec %>%
  filter(!user_id %in% na_list) %>%
  select(-birth_year, -gender) %>%
  left_join(user_info, by = "user_id")
colSums(is.na(user_spec2))
##                      application_id                             user_id 
##                                   0                                   0 
##                         insert_time                        credit_score 
##                                   0                              103481 
##                       yearly_income                         income_type 
##                                  89                                   0 
##                 company_enter_month                     employment_type 
##                              169287                                   0 
##                       houseown_type                      desired_amount 
##                                   0                                  84 
##                             purpose          personal_rehabilitation_yn 
##                                   0                              583589 
## personal_rehabilitation_complete_yn                   existing_loan_cnt 
##                             1195440                              193850 
##                   existing_loan_amt                          birth_year 
##                              308314                                   0 
##                              gender 
##                                   0

2.2 income_type

income_type이 공백인 값들에 “BLANK” 입력

# Label empty-string income_type entries explicitly as "BLANK", then list the
# distinct values that remain.
user_spec2$income_type <- replace(
  user_spec2$income_type,
  user_spec2$income_type == "",
  "BLANK"
)
unique(user_spec2$income_type)
## [1] "PRIVATEBUSINESS" "FREELANCER"      "EARNEDINCOME"    "OTHERINCOME"    
## [5] "EARNEDINCOME2"   "PRACTITIONER"    "BLANK"

2.3 yearly_income

yearly_income이 NA인 89개 행 제거

# Keep only the rows whose yearly_income is present.
user_spec3 <- subset(user_spec2, !is.na(yearly_income))

2.4 company_enter_month

우선 company_enter_month가 NA인 경우 0을 대입하고, 이후 YYYYMM 형태가 아닌 값(예: 20151101과 같은 YYYYMMDD 형태)은 100으로 나눈 몫을 취해 YYYYMM 형태로 수정

# Normalise company_enter_month: missing values become 0, and values above
# 1000000 (too large to be YYYYMM, e.g. YYYYMMDD entries such as 20151101)
# lose their last two digits through an integer division by 100.
user_spec3$company_enter_month <- local({
  enter_month <- user_spec3$company_enter_month
  enter_month[is.na(enter_month)] <- 0
  has_day_digits <- enter_month > 1000000
  enter_month[has_day_digits] <- enter_month[has_day_digits] %/% 100
  enter_month
})

2.5 desired_amount

Na 값을 가진 74 행 제거

# Keep only the rows whose desired_amount is present.
user_spec4 <- subset(user_spec3, !is.na(desired_amount))

2.6 existing_loan_cnt

기존 대출 횟수가 Na인 값은 0으로 수정

# A missing existing_loan_cnt is recorded as zero existing loans.
user_spec4$existing_loan_cnt <- replace(
  user_spec4$existing_loan_cnt,
  is.na(user_spec4$existing_loan_cnt),
  0
)

2.7 num to factor

문자형(chr) 또는 숫자형(num)으로 저장된 변수 중 범주형 데이터들을 factor로 변환

# Convert the categorical columns to factors in one pass, then re-check the
# per-column NA counts of the cleaned table.
user_spec4 <- local({
  factor_cols <- c(
    "income_type", "employment_type", "houseown_type", "purpose", "gender"
  )
  converted <- user_spec4
  converted[factor_cols] <- lapply(converted[factor_cols], as.factor)
  converted
})
colSums(is.na(user_spec4))
##                      application_id                             user_id 
##                                   0                                   0 
##                         insert_time                        credit_score 
##                                   0                              103477 
##                       yearly_income                         income_type 
##                                   0                                   0 
##                 company_enter_month                     employment_type 
##                                   0                                   0 
##                       houseown_type                      desired_amount 
##                                   0                                   0 
##                             purpose          personal_rehabilitation_yn 
##                                   0                              583505 
## personal_rehabilitation_complete_yn                   existing_loan_cnt 
##                             1195356                                   0 
##                   existing_loan_amt                          birth_year 
##                              308292                                   0 
##                              gender 
##                                   0

3. User_spec 데이터 분석

3.1 birth_year

# Overall distribution: number of user_spec4 rows per birth_year, drawn as an
# interactive line chart with an x-axis tick every 5 years.
ggplotly(
  user_spec4 %>%
    count(birth_year, name = "num") %>%
    ggplot(aes(x = birth_year, y = num)) +
    geom_line() +
    theme_minimal() +
    scale_x_continuous(
      breaks = seq(
        from = min(user_spec4$birth_year),
        to = max(user_spec4$birth_year),
        by = 5
      )
    ) +
    labs(title = "전체 나이 분포")
)
# Birth-year distribution split by gender (one line per gender level).
ggplotly(
  user_spec4 %>%
    count(birth_year, gender, name = "num") %>%
    ggplot(aes(x = birth_year, y = num, group = gender, colour = gender)) +
    geom_line() +
    theme_minimal() +
    scale_color_discrete(name = "성별") +
    labs(title = "성별 나이 분포")
)
# Birth-year distribution split by income_type (one line per income type).
ggplotly(
  user_spec4 %>%
    count(birth_year, income_type, name = "num") %>%
    ggplot(
      aes(x = birth_year, y = num, group = income_type, colour = income_type)
    ) +
    geom_line() +
    theme_minimal() +
    scale_color_discrete(name = "수입 종류별") +
    labs(title = "수입 종류별 나이 분포")
)
# Birth-year distribution split by employment_type (one line per type).
ggplotly(
  user_spec4 %>%
    count(birth_year, employment_type, name = "num") %>%
    ggplot(
      aes(
        x = birth_year, y = num,
        group = employment_type, colour = employment_type
      )
    ) +
    geom_line() +
    theme_minimal() +
    scale_color_discrete(name = "고용 형태별") +
    labs(title = "고용 형태별 나이 분포")
)
# Birth-year distribution split by houseown_type (one line per type).
ggplotly(
  user_spec4 %>%
    count(birth_year, houseown_type, name = "num") %>%
    ggplot(
      aes(
        x = birth_year, y = num,
        group = houseown_type, colour = houseown_type
      )
    ) +
    geom_line() +
    theme_minimal() +
    scale_color_discrete(name = "집 종류별") +
    labs(title = "집 종류별 나이 분포")
)
# Birth-year distribution split by loan purpose (one line per purpose).
ggplotly(
  user_spec4 %>%
    count(birth_year, purpose, name = "num") %>%
    ggplot(aes(x = birth_year, y = num, group = purpose, colour = purpose)) +
    geom_line() +
    theme_minimal() +
    scale_color_discrete(name = "목적별") +
    labs(title = "목적별 나이 분포")
)
# List the distinct gender values (and factor levels) left after cleaning.
unique(user_spec4[["gender"]])
## [1] 1 0
## Levels: 0 1

3.2 company_enter_month

# Frequency of company entry years: derive the year from the YYYYMM value,
# count rows per year, and keep only years with more than 100 rows.
user_spec4 %>%
  group_by(year = company_enter_month %/% 100) %>%
  summarise(num = n()) %>%
  filter(num > 100)
## # A tibble: 38 × 2
##     year    num
##    <dbl>  <int>
##  1     0 169199
##  2  1986    306
##  3  1987    380
##  4  1988    426
##  5  1989    557
##  6  1990    912
##  7  1991    991
##  8  1992   1281
##  9  1993   1289
## 10  1994   1356
## # … with 28 more rows
# Overall distribution of company_enter_month. Rows with the value 0 are
# excluded, and the x axis is limited to 1986-01 through 2022-11.
ggplotly(
  user_spec4 %>%
    filter(company_enter_month > 0) %>%
    count(company_enter_month, name = "num") %>%
    ggplot(aes(x = ym(company_enter_month), y = num)) +
    geom_line() +
    xlim(ym(198601), ym(202211)) +
    theme_minimal() +
    xlab("date") +
    labs(title = "전체 회사 입사일 분포")
)
# Distribution of company_enter_month split by gender. Rows with the value 0
# are excluded, and the x axis is limited to 1986-01 through 2022-11.
ggplotly(
  user_spec4 %>%
    filter(company_enter_month > 0) %>%
    count(company_enter_month, gender, name = "num") %>%
    ggplot(
      aes(x = ym(company_enter_month), y = num, group = gender, colour = gender)
    ) +
    geom_line() +
    xlim(ym(198601), ym(202211)) +
    theme_minimal() +
    scale_color_discrete(name = "성별") +
    xlab("date") +
    labs(title = "성별 회사 입사일 분포")
)
# Distribution of company_enter_month split by income_type. Rows with the
# value 0 are excluded, and the x axis is limited to 1986-01 through 2022-11.
ggplotly(
  user_spec4 %>%
    filter(company_enter_month > 0) %>%
    count(company_enter_month, income_type, name = "num") %>%
    ggplot(
      aes(
        x = ym(company_enter_month), y = num,
        group = income_type, colour = income_type
      )
    ) +
    geom_line() +
    xlim(ym(198601), ym(202211)) +
    theme_minimal() +
    scale_color_discrete(name = "수입 종류") +
    xlab("date") +
    labs(title = "수입 종류별 회사 입사일 분포")
)
# Distribution of company_enter_month split by employment_type. Rows with the
# value 0 are excluded. The x axis is limited to 1986-01 through 2022-11, the
# same window used by the overall / gender / income_type charts of this
# section, so the charts are comparable and out-of-range dates do not stretch
# the axis.
ggplotly(
  user_spec4 %>%
    filter(company_enter_month > 0) %>%
    count(company_enter_month, employment_type, name = "num") %>%
    ggplot(
      aes(
        x = ym(company_enter_month), y = num,
        group = employment_type, colour = employment_type
      )
    ) +
    geom_line() +
    xlim(ym(198601), ym(202211)) +
    theme_minimal() +
    scale_color_discrete(name = "고용 형태별") +
    xlab("date") +
    labs(title = "고용 형태별 회사 입사일 분포")
)
# Distribution of company_enter_month split by houseown_type. Rows with the
# value 0 are excluded. The x axis is limited to 1986-01 through 2022-11, the
# same window used by the overall / gender / income_type charts of this
# section, so the charts are comparable and out-of-range dates do not stretch
# the axis.
ggplotly(
  user_spec4 %>%
    filter(company_enter_month > 0) %>%
    count(company_enter_month, houseown_type, name = "num") %>%
    ggplot(
      aes(
        x = ym(company_enter_month), y = num,
        group = houseown_type, colour = houseown_type
      )
    ) +
    geom_line() +
    xlim(ym(198601), ym(202211)) +
    theme_minimal() +
    scale_color_discrete(name = "집 종류별") +
    xlab("date") +
    labs(title = "집 종류별 회사 입사일 분포")
)
# Distribution of company_enter_month split by loan purpose. Rows with the
# value 0 are excluded. The x axis is limited to 1986-01 through 2022-11, the
# same window used by the overall / gender / income_type charts of this
# section, so the charts are comparable and out-of-range dates do not stretch
# the axis.
ggplotly(
  user_spec4 %>%
    filter(company_enter_month > 0) %>%
    count(company_enter_month, purpose, name = "num") %>%
    ggplot(
      aes(
        x = ym(company_enter_month), y = num,
        group = purpose, colour = purpose
      )
    ) +
    geom_line() +
    xlim(ym(198601), ym(202211)) +
    theme_minimal() +
    scale_color_discrete(name = "목적별") +
    xlab("date") +
    labs(title = "목적별 회사 입사일 분포")
)

3.3 desired_amount